package action.RDD创建操作

import org.apache.spark.{SparkConf, SparkContext}

import scala.util.Random

/**
  *
  * @author wdmcode@aliyun.com
  * @version 1.0.0
  * @date 2018/11/8
  */
object GroupByTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("GroupBy Test").setMaster("local[4]")
    var numMappers = 100
    var numKVPairs = 10000
    var valSize = 1000
    var numReducers = 36

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
      }
      arr1
    }.cache
    // Enforce that everything has been calculated and in cache
//    println(pairs1.count)

    println(pairs1.groupByKey(numReducers).count)

    println(pairs1.toDebugString)
    sc.stop()
  }
}
